Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path= './data/NCDC/europe/uk/marham/dat.txt' 
file_path, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 4 
# file_path, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 4
# file_path= './data/NCDC/europe/uk/middle_wallop/dat.txt' 
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 4 
# file_path= "./data/NCDC/europe/germany/neuburg/dat.txt"
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # time shift
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path= './data/NCDC/europe/spain/almeria/dat.txt' # negative dimensions?
# file_path= './data/NCDC/europe/greece/eleftherios_intl/dat.txt'
# file_path= './data/NCDC/europe/greece/elefsis/dat.txt' # bad dataset
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" , 0.6
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend, try 2
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, incomplete dataset

# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
# file_path = './data/asos/topeka/hr_avg.csv' # High 0

# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = False
else:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19790101 0000 FM-12 230 3.6 N
1 19790101 0100 FM-12 250 5.1 N
2 19790101 0200 FM-12 250 7.7 N
3 19790101 0300 FM-12 270 9.3 N
4 19790101 0400 FM-12 300 11.3 N
5 19790101 0500 FM-12 330 10.3 N
6 19790101 0600 FM-12 330 9.3 N
7 19790101 0700 FM-12 360 6.7 N
8 19790101 0800 FM-12 310 6.7 N
9 19790101 0900 FM-12 290 4.1 N
10 19790101 1000 FM-12 280 8.2 N
11 19790101 1100 FM-12 290 6.2 N
12 19790101 1200 FM-12 20 3.1 N
13 19790101 1300 FM-12 280 8.2 N
14 19790101 1400 FM-12 280 7.7 N
15 19790101 1500 FM-12 280 10.3 N
16 19790101 1600 FM-12 290 3.6 N
17 19790101 1700 FM-12 300 9.3 N
18 19790101 1800 FM-12 290 4.6 N
19 19790101 1900 FM-12 340 5.7 N
20 19790101 2000 FM-12 60 3.6 N
21 19790101 2100 FM-12 110 3.6 N
22 19790101 2200 FM-12 90 3.6 N
23 19790101 2300 FM-12 320 3.6 N
24 19790102 0000 FM-12 280 14.4 N
25 19790102 0100 FM-12 270 13.4 N
26 19790102 0200 FM-12 280 13.4 N
27 19790102 0300 FM-12 280 11.8 N
28 19790102 0400 FM-12 290 8.2 N
29 19790102 0500 FM-12 290 6.2 N
... ... ... ... ... ... ...
486093 20160801 1350 FM-15 170 3.1 V
486094 20160801 1400 FM-12 170 3.1 N
486095 20160801 1420 FM-15 150 2.1 V
486096 20160801 1450 FM-15 30 2.6 V
486097 20160801 1500 FM-12 30 2.6 N
486098 20160801 1520 FM-15 999 1.5 V
486099 20160801 1550 FM-15 130 2.1 N
486100 20160801 1600 FM-12 130 2.1 N
486101 20160801 1620 FM-15 100 2.1 V
486102 20160801 1650 FM-15 110 3.1 V
486103 20160801 1700 FM-12 110 3.1 N
486104 20160801 1720 FM-15 70 3.1 V
486105 20160801 1750 FM-15 50 4.6 V
486106 20160801 1800 FM-12 50 4.6 N
486107 20160801 1850 FM-15 60 5.1 N
486108 20160801 1900 FM-12 60 5.1 N
486109 20160801 1920 FM-15 60 5.1 N
486110 20160801 1950 FM-15 60 4.1 N
486111 20160801 2000 FM-12 60 4.1 N
486112 20160801 2020 FM-15 50 4.1 N
486113 20160801 2050 FM-15 50 3.6 N
486114 20160801 2100 FM-12 50 3.6 N
486115 20160801 2120 FM-15 50 3.6 N
486116 20160801 2150 FM-15 60 3.6 N
486117 20160801 2200 FM-12 60 3.6 N
486118 20160801 2220 FM-15 60 3.6 N
486119 20160801 2250 FM-15 60 3.1 N
486120 20160801 2300 FM-12 60 3.1 N
486121 20160801 2320 FM-15 70 4.1 N
486122 20160801 2350 FM-15 70 4.1 N

486123 rows × 6 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
56.499 -6.869
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert Windrose coordianates to Polar Cooridinates 
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
date HrMn dir speed month dir_windrose
count 4.853830e+05 485383.000000 485383.000000 485383.000000 485383.000000 485383.000000
mean 2.000788e+07 1153.725957 205.073847 7.268022 6.484026 204.830806
std 1.142918e+05 670.111036 129.567515 3.840033 3.442359 134.306571
min 1.979010e+07 0.000000 0.000000 0.000000 1.000000 0.000000
25% 1.991013e+07 600.000000 130.000000 4.600000 4.000000 140.000000
50% 2.003032e+07 1120.000000 200.000000 6.700000 6.000000 200.000000
75% 2.011102e+07 1700.000000 270.000000 9.800000 9.000000 270.000000
max 2.016080e+07 2350.000000 999.000000 38.600000 12.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xe42c828>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
    knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xe3d4a90>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time

sudden increase in speed

In [15]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1991-01-03 06:00:00 19910103 600 FM-12 200 70 N 1 250 44.0 46.0
2004-04-21 11:00:00 20040421 1100 FM-12 290 66 N 4 160 33.0 32.0
1983-10-18 14:00:00 19831018 1400 FM-12 210 61 N 10 240 31.0 24.0
1979-05-09 01:00:00 19790509 100 FM-12 140 61 N 5 310 45.0 48.0
1989-02-13 15:00:00 19890213 1500 FM-12 160 56 N 2 290 6.0 6.0
1993-01-21 21:00:00 19930121 2100 FM-12 180 56 N 1 270 8.0 6.0
1993-01-17 05:00:00 19930117 500 FM-12 180 55 N 1 270 12.0 3.0
2011-12-08 14:00:00 20111208 1400 FM-12 180 55 N 12 270 3.0 6.0
1981-02-27 04:00:00 19810227 400 FM-12 320 55 N 2 130 19.0 22.0
1979-06-06 04:00:00 19790606 400 FM-12 250 55 N 6 200 49.0 48.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd3fc160>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 22
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1993-01-21 21:00:00 19930121 2100 FM-12 180 56 N 1 270 8.0 6.0
1989-02-13 15:00:00 19890213 1500 FM-12 160 56 N 2 290 6.0 6.0
1993-01-17 05:00:00 19930117 500 FM-12 180 55 N 1 270 12.0 3.0
1981-02-27 04:00:00 19810227 400 FM-12 320 55 N 2 130 19.0 22.0
2011-12-08 14:00:00 20111208 1400 FM-12 180 55 N 12 270 3.0 6.0
1979-12-17 01:00:00 19791217 100 FM-12 210 54 N 12 240 1.0 2.0
1984-01-21 19:00:00 19840121 1900 FM-12 310 54 N 1 140 4.0 2.0
2011-12-08 12:00:00 20111208 1200 FM-12 190 54 N 12 260 2.0 2.0
2008-01-09 05:00:00 20080109 500 FM-12 170 53 N 1 280 13.0 15.0
1996-11-06 05:00:00 19961106 500 FM-12 210 53 N 11 240 34.0 5.0

1.4.2 Direction re-aligment

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

In [17]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       2920
10      2831
20      4235
30      4516
40      4623
50      4010
60      5202
70      6821
80      8805
90      8527
100     8698
110     7263
120     7665
130     7077
140     8106
150     8517
160    10467
170    10216
180    11171
190    11178
200    12361
210    11430
220    12151
230    10247
240    10279
250     9925
260    12179
270    11267
280    12124
290    11058
300    12298
310    10270
320     8852
330     5863
340     4178
350     2837
999     4034
Name: dir, dtype: int64
36 10.0
In [18]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00653928680903
In [20]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0    4031
2       2
3       1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1979
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1980 - 1984
1985 - 1989
1990 - 1994
1995 - 1999
2000 - 2004
2005 - 2009
2010 - 2014
2015 - 2016
In [24]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[24]:
(0, 25.0)
In [25]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [26]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-12
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed month dir_windrose x y
count 4.283700e+04 42837.000000 42837.000000 42837.000000 42837.000000 42837.000000 42837.000000 42837.000000
mean 2.012072e+07 1148.978687 193.699604 14.397208 6.518150 191.330789 -2.265091 -2.886202
std 1.413668e+04 693.172819 90.987991 7.308910 3.459189 102.782601 10.601868 11.612230
min 2.010010e+07 0.000000 -4.985714 0.019023 1.000000 0.000000 -55.087745 -48.092642
25% 2.011040e+07 500.000000 122.446319 8.805925 4.000000 130.000000 -9.357165 -11.028079
50% 2.012070e+07 1100.000000 202.245381 13.504172 7.000000 200.000000 -2.134838 -3.083125
75% 2.013100e+07 1800.000000 270.735440 18.977824 10.000000 270.000000 5.010052 5.509173
max 2.014123e+07 2300.000000 354.982337 55.210478 12.000000 999.000000 36.278378 37.824014
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x251bc7f0>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x261b7240>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [41]:
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [42]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [43]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-28 -27 -26 -25 -24 -23 -22 -21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11
 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25
  26  27  28]
In [44]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
1.9
Wall time: 1min 47s
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.9 3249
[  3.82297160e-06   5.36005874e-06   6.93590100e-06   8.35927712e-06
   9.65228938e-06]
In [48]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [49]:
kde_cdf = cdf_from_pdf(kde_result)

5. GMM by Expectation-maximization

In [50]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [51]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[51]:
weight mean_x mean_y sig_x sig_y corr
1 0.286 1.254 7.053 6.848 8.075 0.129
2 0.279 -6.209 -7.781 7.836 7.269 0.132
3 0.232 7.369 -14.315 8.217 8.257 0.034
4 0.204 -12.804 2.915 8.344 8.167 0.080
In [52]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.285572492719 [[ 1.25389295  7.05341572]] [ 6.66665539  8.22611252] 161.018756295
0.278576485086 [[-6.20900192 -7.78146088]] [ 6.96267676  8.10934431] -59.8778103483
0.232341818395 [[  7.36929272 -14.31534262]] [ 8.09489273  8.37665614] 139.005638925
0.2035092038 [[-12.8037502    2.91545152]] [ 7.90617546  8.59129242] -52.517883782
In [53]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [54]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[54]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.972 0.010 0.026 3.045621e-09 0.043 0.184

6. GMM by Optimization

In [55]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [56]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[56]:
     fun: -20.667158082492286
     jac: array([ -6.07785940e-01,  -2.38418579e-07,  -2.38418579e-07,
        -2.38418579e-07,   0.00000000e+00,  -1.90734863e-06,
        -6.07813358e-01,  -4.76837158e-07,  -4.76837158e-07,
        -2.38418579e-07,  -2.38418579e-07,   2.38418579e-07,
        -6.07758284e-01,  -2.38418579e-07,   0.00000000e+00,
        -2.38418579e-07,  -4.76837158e-07,  -9.53674316e-07,
        -6.07796431e-01,  -4.76837158e-07,   0.00000000e+00,
        -7.15255737e-07,   0.00000000e+00,   2.14576721e-06,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 3060
     nit: 117
    njev: 117
  status: 0
 success: True
       x: array([  0.27025892,  -1.86397684,   8.30651322,   7.91539428,
         7.51734246,   0.36200576,   0.07341415,   4.01465642,
        -8.77460495,   9.80413896,   5.17946032,  -0.75302674,
         0.03409677,   5.82530079,   3.36256866,   2.80880778,
         3.5458152 ,  -0.20307071,   0.62223015,  -4.03191533,
        -7.60276733,  12.2518056 ,  10.16645533,  -0.33446368])

6.1 GMM Result

In [57]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.622 -4.032 -7.603 12.252 10.166 -0.334
2 0.270 -1.864 8.307 7.915 7.517 0.362
3 0.073 4.015 -8.775 9.804 5.179 -0.753
4 0.034 5.825 3.363 2.809 3.546 -0.203
In [58]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.622230151725 [[-4.03191533 -7.60276733]] [  8.8860637  13.2098988] -120.351832507
0.270258922839 [[-1.86397684  8.30651322]] [ 6.15010034  9.01887854] -49.0576534195
0.0734141538916 [[ 4.01465642 -8.77460495]] [  3.14217612  10.63365785] -113.910557547
0.0340967715441 [[ 5.82530079  3.36256866]] [ 2.67150005  3.65038272] -159.59171308

6.2 Goodness-of-fit statistics

In [59]:
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.018 1.057712e-09 0.025 0.108
In [60]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [61]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='', ylabel='', colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='', ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [62]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [63]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [64]:
# Calculate Speed Distribution
# 1. GMM Model
x = arange(0, max_speed, 0.5)
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [65]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [66]:
# %%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.911060841151

6.3 Sectoral Comaprison

In [67]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
# #         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [68]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 677 weight 0.015804094591124494
GMM Weibull
R square 0.812099034927 0.812487960177
max diff: 0.0513573036973 0.0842356082036 speed value: 6.58162347878 3.94897408727 y gmm 0.319396019789
 
25.0 (15.0 - 35.0) degree
data size: 1348 weight 0.03146812335130845
GMM Weibull
R square 0.906127142222 0.859294283189
max diff: 0.0373884436327 0.0980326911915 speed value: 8.11157560821 4.86694536492 y gmm 0.464837075655
 
45.0 (35.0 - 55.0) degree
data size: 1339 weight 0.031258024604897634
GMM Weibull
R square 0.908881659425 0.873689769832
max diff: 0.0569979491126 0.0473911353572 speed value: 7.91590166873 3.95795083436 y gmm 0.322389653576
 
65.0 (55.0 - 75.0) degree
data size: 2180 weight 0.05089058524173028
GMM Weibull
R square 0.969051536147 0.973349187846
max diff: 0.0168187753153 0.0146372208688 speed value: 9.9452611016 13.9233655422 y gmm 0.345717857884
 
85.0 (75.0 - 95.0) degree
data size: 2183 weight 0.05096061815720055
GMM Weibull
R square 0.948136675234 0.960228012437
max diff: 0.046627691951 0.0399965561883 speed value: 16.0719690856 16.0719690856 y gmm 0.691347571448
 
105.0 (95.0 - 115.0) degree
data size: 1983 weight 0.046291757125849146
GMM Weibull
R square 0.946629493529 0.943739850372
max diff: 0.0369784667564 0.0543072548198 speed value: 5.7760769521 15.4028718723 y gmm 0.123211447089
 
125.0 (115.0 - 135.0) degree
data size: 2007 weight 0.046852020449611315
GMM Weibull
R square 0.962167135265 0.947976177174
max diff: 0.0399964550621 0.0353358187685 speed value: 22.5288439179 5.00640975952 y gmm 0.952303380822
 
145.0 (135.0 - 155.0) degree
data size: 2353 weight 0.054929150033849246
GMM Weibull
R square 0.956806029046 0.937394577964
max diff: 0.0236220482054 0.0356141259304 speed value: 5.35999200854 16.0799760256 y gmm 0.0984201782521
 
165.0 (155.0 - 175.0) degree
data size: 2785 weight 0.06501388986156827
GMM Weibull
R square 0.957355686526 0.966422672687
max diff: 0.0428842377951 0.0124046791099 speed value: 24.3792367076 8.12641223588 y gmm 0.844727970463
 
185.0 (175.0 - 195.0) degree
data size: 3167 weight 0.07393141443144945
GMM Weibull
R square 0.934239563086 0.961696191298
max diff: 0.0815087146859 0.0483799931154 speed value: 20.3407025544 14.5290732531 y gmm 0.7176703191
 
205.0 (195.0 - 215.0) degree
data size: 3267 weight 0.07626584494712516
GMM Weibull
R square 0.96688851759 0.979107822739
max diff: 0.0522236186251 0.0172089238756 speed value: 23.9556850382 23.9556850382 y gmm 0.858703837757
 
225.0 (215.0 - 235.0) degree
data size: 3333 weight 0.0778065690874711
GMM Weibull
R square 0.962315926495 0.96633560063
max diff: 0.0322143204897 0.011390525926 speed value: 20.2359457941 13.4906305294 y gmm 0.773066207563
 
245.0 (235.0 - 255.0) degree
data size: 2665 weight 0.06221257324275743
GMM Weibull
R square 0.932831040405 0.939328212422
max diff: 0.0424607220915 0.021020591081 speed value: 12.6187204746 10.0949763797 y gmm 0.419452973969
 
265.0 (255.0 - 275.0) degree
data size: 3307 weight 0.07719961715339543
GMM Weibull
R square 0.96482799769 0.964513859397
max diff: 0.0402895200889 0.0194385106768 speed value: 17.7835964285 10.162055102 y gmm 0.648091152989
 
285.0 (275.0 - 295.0) degree
data size: 3132 weight 0.07311436375096295
GMM Weibull
R square 0.95194847606 0.955981731306
max diff: 0.0458963736856 0.0243327026438 speed value: 12.5494255723 12.5494255723 y gmm 0.366300305752
 
305.0 (295.0 - 315.0) degree
data size: 3596 weight 0.08394612134369821
GMM Weibull
R square 0.965303445486 0.979842881058
max diff: 0.0292533384613 0.0146703362085 speed value: 12.8350017606 10.2680014085 y gmm 0.332645996971
 
325.0 (315.0 - 335.0) degree
data size: 2358 weight 0.05504587155963303
GMM Weibull
R square 0.913064092059 0.948424416821
max diff: 0.0446177487441 0.0289753587766 speed value: 24.7945043127 24.7945043127 y gmm 0.803558671951
 
345.0 (335.0 - 355.0) degree
data size: 950 weight 0.02217708989891916
GMM Weibull
R square 0.904226845447 0.929926515633
max diff: 0.0643423225102 0.0497678252025 speed value: 15.5814474395 12.1189035641 y gmm 0.681973466963
 
Wall time: 1min 23s
In [69]:
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9468165473417283 0.9515174510838722
In [70]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.04284196535641971 0.030051320705136466
In [71]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [72]:
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]
incre = rebinned_angle
In [73]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [74]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
185.0 (175.0 - 195.0) Degree Speed Distribution
0.0966739118192 20.5 0.723345033555

6.4.2 Time Variability

In [75]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: RuntimeWarning: divide by zero encountered in log
185.0 (175.0 - 195.0) Degree Speed Distribution

6.4.3 Adjacent Sector Variability

In [76]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [77]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('angle == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
185.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [78]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
1.9 square_error

7.1 Variability of the Result

In [79]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.543 -3.169 -8.800 12.288 10.230 -0.361
2 0.345 -3.289 6.519 8.613 8.790 0.399
3 0.070 4.758 -9.115 8.565 4.647 -0.690
4 0.042 5.502 3.873 3.239 4.862 -0.468
GMM Plot Result
0.543239718383 [[-3.16871939 -8.79973142]] [  8.76424149  13.37230327] -121.491299774
0.344533902568 [[-3.28886219  6.51867508]] [  6.74287596  10.29458452] 136.460366846
0.0701138898297 [[ 4.75758097 -9.11490794]] [ 3.11882489  9.23181213] -113.362203913
0.0421124892196 [[ 5.50231729  3.87338887]] [ 2.68071168  5.19073187] -155.861684316
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.007 0.019 1.153451e-09 0.026 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.017 1.112941e-09 0.026 0.111

weight mean_x mean_y sig_x sig_y corr
1 0.589 -3.565 -8.228 12.171 9.954 -0.325
2 0.295 -2.807 7.648 8.352 8.022 0.388
3 0.070 4.374 -8.906 9.134 4.899 -0.701
4 0.046 5.705 4.068 3.202 4.245 -0.315
GMM Plot Result
0.588782863344 [[-3.56548533 -8.22784937]] [  8.78844666  13.03810235] -119.033733635
0.29499293962 [[-2.80702574  7.64843962]] [ 6.39467657  9.65508292] -47.9611433973
0.0699192265501 [[ 4.3739972  -8.90562296]] [ 3.24397585  9.84406536] -113.266233745
0.0463049704861 [[ 5.70486085  4.06772031]] [ 2.88969198  4.4631127 ] -156.08098798
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.019 1.145419e-09 0.027 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.018 1.117862e-09 0.026 0.111

weight mean_x mean_y sig_x sig_y corr
1 0.505 -3.162 -9.258 12.290 10.267 -0.393
2 0.366 -3.516 5.626 8.794 9.103 0.359
3 0.075 5.649 -9.503 8.177 4.449 -0.662
4 0.053 5.170 5.202 3.695 5.661 -0.527
GMM Plot Result
0.505409482307 [[-3.16239723 -9.25839957]] [  8.58220106  13.52082678] -122.642475148
0.36632038399 [[-3.51605227  5.62649731]] [  7.1573066  10.4385021] 137.750545998
0.0749592814813 [[ 5.64915692 -9.50252367]] [ 3.10519239  8.77555329] -112.840690135
0.0533108522216 [[ 5.16974512  5.20164191]] [ 2.91443495  6.09948982] -154.90959682
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.007 0.018 1.187120e-09 0.026 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.011 0.016 1.165587e-09 0.027 0.114

weight mean_x mean_y sig_x sig_y corr
1 0.621 -3.925 -7.603 12.246 10.157 -0.320
2 0.281 -2.266 8.020 8.098 7.797 0.386
3 0.066 5.005 -9.285 8.832 5.075 -0.728
4 0.033 5.918 3.476 2.745 3.459 -0.162
GMM Plot Result
0.62070730373 [[-3.9245286  -7.60322849]] [  8.9681734   13.14215107] -119.761392045
0.280555321188 [[-2.26604158  8.02003341]] [ 6.22309544  9.36242572] -47.8071046012
0.0656250819148 [[ 5.004699  -9.2846876]] [ 3.1769675   9.67792471] -115.653097724
0.0331122931677 [[ 5.91825143  3.47630736]] [ 2.65608466  3.52766272] -162.634857385
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.018 1.049674e-09 0.025 0.108
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.017 1.103346e-09 0.026 0.111

weight mean_x mean_y sig_x sig_y corr
1 0.618 -3.983 -7.345 12.323 10.450 -0.326
2 0.266 -1.730 8.281 7.751 7.431 0.394
3 0.087 3.439 -8.586 10.473 5.524 -0.742
4 0.029 5.982 2.884 2.591 3.055 -0.010
GMM Plot Result
0.617872529746 [[-3.9829251  -7.34450241]] [  9.13414315  13.3276797 ] -121.53902505
0.266427643213 [[-1.72968013  8.2811308 ]] [ 5.90241082  8.97006357] -48.0614229032
0.0865833396198 [[ 3.43854872 -8.58566731]] [  3.42161771  11.3356945 ] -113.656909655
0.0291164874213 [[ 5.98213355  2.88431292]] [ 2.59079937  3.05581812] -178.228340579
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.019 1.087043e-09 0.025 0.110
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.009 0.018 1.141946e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.648 -4.193 -7.084 12.046 10.178 -0.306
2 0.247 -1.420 8.650 7.630 7.361 0.353
3 0.075 4.529 -9.178 10.171 5.877 -0.788
4 0.030 5.585 3.499 2.638 3.250 -0.185
GMM Plot Result
0.64757262805 [[-4.19337155 -7.08428113]] [  9.02335195  12.93370465] -120.536349672
0.246691161591 [[-1.41994644  8.64970771]] [ 6.02338694  8.72448879] -47.9040405558
0.0752456892949 [[ 4.5287664  -9.17791256]] [  3.26115353  11.28562789] -116.908223984
0.0304905210639 [[ 5.58460114  3.49890553]] [ 2.52207842  3.34094993] -159.318566088
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.009 0.020 1.196203e-09 0.026 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.011 0.018 1.108410e-09 0.026 0.111

weight mean_x mean_y sig_x sig_y corr
1 0.621 -4.187 -7.352 12.180 10.560 -0.317
2 0.262 -2.006 8.108 7.677 7.828 0.393
3 0.083 3.360 -8.661 10.209 4.890 -0.720
4 0.034 5.955 3.387 2.725 3.663 -0.134
GMM Plot Result
0.621149146675 [[-4.18709145 -7.35193941]] [  9.23091294  13.21540763] -122.840004901
0.262344204657 [[-2.00568234  8.10848189]] [ 6.04042089  9.14976601] 136.423561611
0.0826510782773 [[ 3.36017663 -8.66100555]] [  3.18975779  10.86069998] -110.918613341
0.033855570391 [[ 5.9552185  3.3867168]] [ 2.67220746  3.70129679] -168.000303471
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.019 1.095256e-09 0.025 0.110
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.018 1.131554e-09 0.026 0.112

weight mean_x mean_y sig_x sig_y corr
1 0.631 -4.167 -7.752 12.281 9.886 -0.346
2 0.275 -1.305 8.449 7.907 7.068 0.284
3 0.067 4.844 -9.046 9.473 5.130 -0.748
4 0.027 5.791 3.053 2.658 3.007 -0.073
GMM Plot Result
0.631475949585 [[-4.16720124 -7.75176424]] [  8.63432621  13.19171292] -118.870977095
0.275127808528 [[-1.30509495  8.44893374]] [ 6.25847964  8.56209673] -55.8087329104
0.0666131468208 [[ 4.84381052 -9.04611287]] [  3.12666007  10.30916394] -114.453752006
0.0267830950658 [[ 5.79062276  3.05265264]] [ 2.62776079  3.03375573] -164.685850158
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.007 0.021 1.143311e-09 0.027 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.020 1.142861e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.464 -9.204 -6.376 9.356 10.320 -0.417
2 0.311 1.294 7.365 7.212 7.235 -0.149
3 0.178 8.521 -11.145 7.585 6.794 -0.426
4 0.048 -5.769 -1.566 4.539 7.050 -0.728
GMM Plot Result
0.463697884621 [[-9.20413576 -6.37571546]] [  7.45736406  11.76562963] -141.617595623
0.31055353544 [[ 1.29412581  7.36499915]] [ 6.6626525  7.744177 ] -135.615045046
0.177786558286 [[  8.52129388 -11.14478589]] [ 5.40055283  8.63218405] -127.737210839
0.0479620216536 [[-5.76878891 -1.56602918]] [ 2.77396181  7.91288518] -150.998555903
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.985 0.015 0.069 1.658988e-09 0.032 0.136
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.984 0.016 0.062 1.722931e-09 0.032 0.138

weight mean_x mean_y sig_x sig_y corr
1 0.516 -3.405 -8.751 12.449 10.625 -0.381
2 0.347 -3.595 5.843 8.662 9.409 0.426
3 0.080 4.886 -9.478 8.761 4.550 -0.656
4 0.056 5.334 4.353 3.641 5.450 -0.467
GMM Plot Result
0.51649188861 [[-3.40492451 -8.75061021]] [  8.90904527  13.7291708 ] -123.658735293
0.346959560521 [[-3.5952631   5.84259674]] [  6.8097995   10.82529837] 140.494598174
0.08009344495 [[ 4.88561    -9.47832356]] [ 3.22624487  9.33015056] -111.502555748
0.0564551059196 [[ 5.33398527  4.35313397]] [ 3.01611636  5.81949623] -155.802962952
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.016 1.118817e-09 0.026 0.112
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.012 0.016 1.125149e-09 0.026 0.112
Wall time: 1min 28s

7.2 Cross-validation, to select the number of Gaussian

In [80]:
# df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn%400 == 0)')
In [81]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 32127.75 10709.25
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.078321 0.022049 8.565912e-09 0.070275 0.308435 0.921420
1 0.076029 0.021460 8.128246e-09 0.070890 0.300647 0.923881
2 0.076724 0.020246 8.322011e-09 0.070942 0.303908 0.923385
3 0.075982 0.021912 8.221839e-09 0.070940 0.302111 0.924874
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.082231 0.024304 8.148867e-09 0.074323 0.300859 0.923973
1 0.079622 0.019516 8.866828e-09 0.070709 0.313225 0.922192
2 0.080427 0.023880 8.801802e-09 0.071641 0.313003 0.918791
3 0.083945 0.023286 9.026231e-09 0.072253 0.316848 0.914667
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.032955 0.010650 2.699028e-09 0.039959 0.173130 0.975409
1 0.031790 0.010774 2.754048e-09 0.040260 0.174948 0.974369
2 0.034779 0.011149 2.797791e-09 0.041453 0.176157 0.974451
3 0.032439 0.012008 2.848338e-09 0.041739 0.177933 0.973421
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.049040 0.014426 3.486034e-09 0.046428 0.196791 0.966794
1 0.036915 0.011996 3.424484e-09 0.045011 0.194837 0.969432
2 0.044289 0.013499 3.263090e-09 0.042906 0.190760 0.969158
3 0.029075 0.015891 2.997338e-09 0.040844 0.182234 0.973410
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.053489 0.014625 2.207607e-09 0.035642 0.156546 0.979766
1 0.022753 0.009299 2.152612e-09 0.036329 0.154703 0.980062
2 0.045406 0.015554 2.097917e-09 0.035700 0.152655 0.980774
3 0.069889 0.014853 2.172677e-09 0.036630 0.155285 0.979823
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.057389 0.017191 2.541783e-09 0.041325 0.168142 0.976245
1 0.030665 0.009759 2.775022e-09 0.039502 0.175280 0.974841
2 0.063605 0.017375 2.799931e-09 0.040762 0.176307 0.973807
3 0.055259 0.023413 2.639160e-09 0.038388 0.171388 0.976273
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018493 0.007487 1.013784e-09 0.024803 0.106096 0.990620
1 0.017935 0.007373 1.054744e-09 0.024912 0.108203 0.990321
2 0.018113 0.007016 1.178274e-09 0.026914 0.114457 0.989129
3 0.018435 0.007960 1.104755e-09 0.025765 0.110757 0.989810
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025808 0.010849 1.860024e-09 0.032661 0.143789 0.983094
1 0.024193 0.010402 1.549395e-09 0.031721 0.131291 0.985545
2 0.020697 0.010280 1.316708e-09 0.027307 0.120736 0.987929
3 0.022146 0.008661 1.551411e-09 0.030584 0.131311 0.985769
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.014808 0.008396 7.741924e-10 0.021365 0.092717 0.992868
1 0.027677 0.006322 9.991523e-10 0.024682 0.105309 0.990793
2 0.249225 0.012990 1.096555e-09 0.025907 0.110437 0.989861
3 0.018864 0.007188 1.003119e-09 0.024381 0.105521 0.990765
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.017504 0.008147 1.256813e-09 0.027942 0.118188 0.988420
1 0.030990 0.008362 1.566917e-09 0.029001 0.132043 0.985576
2 0.186281 0.010618 1.633980e-09 0.030411 0.134423 0.985102
3 0.022541 0.012243 1.628535e-09 0.032098 0.134605 0.984975
Wall time: 2min 26s
In [82]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.076764 0.021417 8.309502e-09 0.070762 0.303775 0.923390
2 0.032991 0.011145 2.774801e-09 0.040853 0.175542 0.974413
3 0.047884 0.013583 2.157703e-09 0.036075 0.154797 0.980106
4 0.018244 0.007459 1.087889e-09 0.025599 0.109878 0.989970
5 0.077644 0.008724 9.682547e-10 0.024084 0.103496 0.991072
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.081556 0.022747 8.710932e-09 0.072232 0.310984 0.919906
2 0.039830 0.013953 3.292736e-09 0.043797 0.191155 0.969699
3 0.051730 0.016934 2.688974e-09 0.039994 0.172779 0.975292
4 0.023211 0.010048 1.569385e-09 0.030568 0.131782 0.985584
5 0.064329 0.009842 1.521561e-09 0.029863 0.129815 0.986019
In [83]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
R_square
K_S
Chi_square
In [84]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)